# Builds a long-format table of census cell counts by city and year from
# IPUMS extracts, merges civil-service reform years by city, and writes the
# result to ./data/census_stack_long_new_bc.csv.

# Remove every object from the global environment before running.
# NOTE(review): this does not detach packages or reset options, so it is not
# equivalent to starting from a fresh R session.
rm(list=ls())
# Every relative path used below is resolved against this project directory.
setwd("~/Dropbox/merit_project")
# Project helper scripts, currently disabled.
#source('./code/functions.R')
#source('./code/census_code/census_cleaning_functions.R')

library(tidyverse)  # dplyr / tidyr / readr / stringr verbs used throughout
library(ipumsr)     # DDI parsing and chunked microdata reads
library(labelled)   # NOTE(review): no labelled function is called in the visible code; confirm it is still needed


# Chunk callback for the extracts that carry LIT and SPEAKENG.
#
# Recodes the raw census codes of one microdata chunk into analysis
# categories and collapses the chunk to one row per combination of CITY, YEAR
# and the recoded categories, with the number of records in `num`.
#
# Arguments:
#   x    data frame holding one chunk; must contain CITY, YEAR, RACE, BPL,
#        MBPL, FBPL, IND1950, OCC1950, SPEAKENG and LIT.
#   pos  part of the chunked-callback signature; not used here.
#
# Returns an ungrouped data frame with the grouping columns plus `num`.
# Counts are per chunk only: nothing here combines cells across chunks.
ipums_clean_1850_1930_long <- function(x, pos) {
  german_codes <- c(450, 453)
  utility_codes <- 586:598
  fire_codes <- c(542, 680, 762)
  teacher_codes <- c(4, 31, 57, 93)
  non_occupation_codes <- 980:999

  recoded <- mutate(
    x,
    race_recode = case_when(
      RACE == 1 ~ "white",
      RACE == 2 ~ "black",
      RACE > 2 ~ "other"
    ),
    # NOTE(review): BPL == 120 matches neither branch and therefore becomes
    # NA; confirm that gap is intended.
    birthplace_recode_first = case_when(
      BPL < 120 ~ "native_born",
      BPL > 120 ~ "foreign_born"
    ),
    # Own birthplace; everything outside the listed codes (including the
    # native born) is "other".
    birthplace_recode_foreign = case_when(
      BPL == 465 ~ "russian",
      BPL == 434 ~ "italian",
      BPL == 414 ~ "irish",
      BPL %in% german_codes ~ "german",
      BPL == 455 ~ "polish",
      TRUE ~ "other"
    ),
    # Parental origin is only distinguished when the person has BPL < 120;
    # the first matching origin wins, so branch order matters.
    parent_birthplace_recode = case_when(
      BPL < 120 & MBPL < 120 & FBPL < 120 ~ "native_born",
      BPL < 120 & (MBPL == 465 | FBPL == 465) ~ "russian",
      BPL < 120 & (MBPL == 434 | FBPL == 434) ~ "italian",
      BPL < 120 & (MBPL == 414 | FBPL == 414) ~ "irish",
      BPL < 120 & (MBPL %in% german_codes | FBPL %in% german_codes) ~ "german",
      BPL < 120 & (MBPL == 455 | FBPL == 455) ~ "polish",
      TRUE ~ "other"
    ),
    # NOTE(review): every non-missing LIT value other than 4 (including any
    # "not applicable" code) is labelled "nonliterate"; confirm intended.
    literacy_recode = case_when(
      LIT == 4 ~ "literate",
      LIT != 4 ~ "nonliterate",
      TRUE ~ NA_character_
    ),
    english_recode = case_when(
      SPEAKENG %in% c(2, 3, 4, 5) ~ "english",
      SPEAKENG %in% c(0, 1, 6, 7, 8) ~ "nonenglish",
      TRUE ~ NA_character_
    ),
    industry_recode = case_when(
      IND1950 == 936 ~ "local",
      IND1950 == 926 ~ "state",
      IND1950 == 916 ~ "federal",
      IND1950 == 906 ~ "postal",
      IND1950 == 946 ~ "pa_ns",
      IND1950 == 888 ~ "education",
      IND1950 %in% utility_codes ~ "utilities",
      TRUE ~ "other"
    ),
    # The police / fireman / teacher codes are claimed by the first three
    # branches, so the collar branches need no explicit exclusion of them.
    occ_recode = case_when(
      OCC1950 == 773 ~ "police",
      OCC1950 %in% fire_codes ~ "fireman",
      OCC1950 %in% teacher_codes ~ "teacher",
      OCC1950 < 500 ~ "white_collar",
      OCC1950 > 499 & !(OCC1950 %in% non_occupation_codes) ~ "blue_collar",
      TRUE ~ NA_character_
    )
  )

  cells <- group_by(
    recoded,
    CITY, YEAR, race_recode, birthplace_recode_first,
    birthplace_recode_foreign, parent_birthplace_recode,
    literacy_recode, english_recode, industry_recode, occ_recode
  )

  ungroup(summarise(cells, num = n()))
}


# Chunk callback for the 1940 extract, which is read without LIT and SPEAKENG.
#
# Applies the same race, birthplace, parental-birthplace, industry and
# occupation recodes as ipums_clean_1850_1930_long() (no literacy or English
# columns) and collapses the chunk to one row per combination of CITY, YEAR
# and the recoded categories, with the number of records in `num`.
#
# Arguments:
#   x    data frame holding one chunk; must contain CITY, YEAR, RACE, BPL,
#        MBPL, FBPL, IND1950 and OCC1950.
#   pos  part of the chunked-callback signature; not used here.
#
# Returns an ungrouped data frame with the grouping columns plus `num`.
# Counts are per chunk only: nothing here combines cells across chunks.
ipums_clean_1940_long <- function(x, pos) {
  german_codes <- c(450, 453)
  utility_codes <- 586:598
  fire_codes <- c(542, 680, 762)
  teacher_codes <- c(4, 31, 57, 93)
  non_occupation_codes <- 980:999

  recoded <- mutate(
    x,
    race_recode = case_when(
      RACE == 1 ~ "white",
      RACE == 2 ~ "black",
      RACE > 2 ~ "other"
    ),
    # NOTE(review): BPL == 120 matches neither branch and therefore becomes
    # NA; confirm that gap is intended.
    birthplace_recode_first = case_when(
      BPL < 120 ~ "native_born",
      BPL > 120 ~ "foreign_born"
    ),
    # Own birthplace; everything outside the listed codes (including the
    # native born) is "other".
    birthplace_recode_foreign = case_when(
      BPL == 465 ~ "russian",
      BPL == 434 ~ "italian",
      BPL == 414 ~ "irish",
      BPL %in% german_codes ~ "german",
      BPL == 455 ~ "polish",
      TRUE ~ "other"
    ),
    # Parental origin is only distinguished when the person has BPL < 120;
    # the first matching origin wins, so branch order matters.
    parent_birthplace_recode = case_when(
      BPL < 120 & MBPL < 120 & FBPL < 120 ~ "native_born",
      BPL < 120 & (MBPL == 465 | FBPL == 465) ~ "russian",
      BPL < 120 & (MBPL == 434 | FBPL == 434) ~ "italian",
      BPL < 120 & (MBPL == 414 | FBPL == 414) ~ "irish",
      BPL < 120 & (MBPL %in% german_codes | FBPL %in% german_codes) ~ "german",
      BPL < 120 & (MBPL == 455 | FBPL == 455) ~ "polish",
      TRUE ~ "other"
    ),
    industry_recode = case_when(
      IND1950 == 936 ~ "local",
      IND1950 == 926 ~ "state",
      IND1950 == 916 ~ "federal",
      IND1950 == 906 ~ "postal",
      IND1950 == 946 ~ "pa_ns",
      IND1950 == 888 ~ "education",
      IND1950 %in% utility_codes ~ "utilities",
      TRUE ~ "other"
    ),
    # The police / fireman / teacher codes are claimed by the first three
    # branches, so the collar branches need no explicit exclusion of them.
    occ_recode = case_when(
      OCC1950 == 773 ~ "police",
      OCC1950 %in% fire_codes ~ "fireman",
      OCC1950 %in% teacher_codes ~ "teacher",
      OCC1950 < 500 ~ "white_collar",
      OCC1950 > 499 & !(OCC1950 %in% non_occupation_codes) ~ "blue_collar",
      TRUE ~ NA_character_
    )
  )

  cells <- group_by(
    recoded,
    CITY, YEAR, race_recode, birthplace_recode_first,
    birthplace_recode_foreign, parent_birthplace_recode,
    industry_recode, occ_recode
  )

  ungroup(summarise(cells, num = n()))
}


# ---- Extract usa_00030 (stored as data_1850_1910) ----
# Parse the DDI once and reuse it for both the chunked read and the CITY
# value labels, instead of reading the XML twice.
ddi_1850_1910 <- read_ipums_ddi("./census_data/ddi_files/usa_00030.xml")

# Each 100,000-row chunk is collapsed to cell counts by the callback and the
# per-chunk results are combined into one data frame.
# NOTE(review): the callback counts within a chunk only, so the same cell can
# appear in several rows; confirm downstream code sums `num`.
data_1850_1910 <- read_ipums_micro_chunked(
  ddi = ddi_1850_1910,
  IpumsDataFrameCallback$new(ipums_clean_1850_1930_long),
  chunk_size = 100000,
  vars = c(CITY, YEAR, RACE, BPL, MBPL, FBPL, IND1950, OCC1950, SPEAKENG, LIT)
)

# Lookup from CITY code to its lower-cased value label.
city_labels1910 <- ddi_1850_1910 %>%
  ipums_val_labels(var = CITY) %>%
  mutate(city = str_to_lower(lbl)) %>%
  select(CITY = val, city)

# CITY is the only shared column; naming it makes the join key explicit.
data_1850_1910 <- left_join(data_1850_1910, city_labels1910, by = "CITY")



# ---- Extract usa_00047 (stored as data_1920) ----
# Parse the DDI once and reuse it for both the chunked read and the CITY
# value labels, instead of reading the XML twice.
ddi_1920 <- read_ipums_ddi("./census_data/ddi_files/usa_00047.xml")

# Each 100,000-row chunk is collapsed to cell counts by the callback and the
# per-chunk results are combined into one data frame.
# NOTE(review): the callback counts within a chunk only, so the same cell can
# appear in several rows; confirm downstream code sums `num`.
data_1920 <- read_ipums_micro_chunked(
  ddi = ddi_1920,
  IpumsDataFrameCallback$new(ipums_clean_1850_1930_long),
  chunk_size = 100000,
  vars = c(CITY, YEAR, RACE, BPL, MBPL, FBPL, IND1950, OCC1950, SPEAKENG, LIT)
)

# Lookup from CITY code to its lower-cased value label.
city_labels1920 <- ddi_1920 %>%
  ipums_val_labels(var = CITY) %>%
  mutate(city = str_to_lower(lbl)) %>%
  select(CITY = val, city)

# CITY is the only shared column; naming it makes the join key explicit.
data_1920 <- left_join(data_1920, city_labels1920, by = "CITY")


# ---- Extract usa_00046 (stored as data_1930) ----
# Parse the DDI once and reuse it for both the chunked read and the CITY
# value labels, instead of reading the XML twice.
ddi_1930 <- read_ipums_ddi("./census_data/ddi_files/usa_00046.xml")

# Each 100,000-row chunk is collapsed to cell counts by the callback and the
# per-chunk results are combined into one data frame.
# NOTE(review): the callback counts within a chunk only, so the same cell can
# appear in several rows; confirm downstream code sums `num`.
data_1930 <- read_ipums_micro_chunked(
  ddi = ddi_1930,
  IpumsDataFrameCallback$new(ipums_clean_1850_1930_long),
  chunk_size = 100000,
  vars = c(CITY, YEAR, RACE, BPL, MBPL, FBPL, IND1950, OCC1950, SPEAKENG, LIT)
)

# Lookup from CITY code to its lower-cased value label.
city_labels1930 <- ddi_1930 %>%
  ipums_val_labels(var = CITY) %>%
  mutate(city = str_to_lower(lbl)) %>%
  select(CITY = val, city)

# CITY is the only shared column; naming it makes the join key explicit.
data_1930 <- left_join(data_1930, city_labels1930, by = "CITY")




# ---- Extract usa_00045 (stored as data_1940) ----
# Parse the DDI once and reuse it for both the chunked read and the CITY
# value labels, instead of reading the XML twice.
ddi_1940 <- read_ipums_ddi("./census_data/ddi_files/usa_00045.xml")

# This extract is read without SPEAKENG and LIT, so it uses the 1940 callback.
# Each 100,000-row chunk is collapsed to cell counts by the callback and the
# per-chunk results are combined into one data frame.
# NOTE(review): the callback counts within a chunk only, so the same cell can
# appear in several rows; confirm downstream code sums `num`.
data_1940 <- read_ipums_micro_chunked(
  ddi = ddi_1940,
  IpumsDataFrameCallback$new(ipums_clean_1940_long),
  chunk_size = 100000,
  vars = c(CITY, YEAR, RACE, BPL, MBPL, FBPL, OCC1950, IND1950)
)

# Lookup from CITY code to its lower-cased value label.
city_labels1940 <- ddi_1940 %>%
  ipums_val_labels(var = CITY) %>%
  mutate(city = str_to_lower(lbl)) %>%
  select(CITY = val, city)

# CITY is the only shared column; naming it makes the join key explicit.
data_1940 <- left_join(data_1940, city_labels1940, by = "CITY")


# Stack the four per-extract count tables into one long table. The 1940
# table has no literacy_recode / english_recode columns, so bind_rows()
# fills them with NA for those rows.
data_stack <- bind_rows(data_1850_1910, data_1920, data_1930, data_1940)


## Reform years by city.
##
## Builds one row per city with the earliest reform year for each coverage
## type: `all`, `fire` (fire_only == 1) and `police` (police_only == 1).
## Where a city has no police- or fire-specific year, the `all` year is used.
## The `city` key is "<city>, <state abbreviation>" in lower case, which is
## the format the join onto the census tables expects.
reforms <-
  read_csv("./data/cities_reform.csv") %>%
  mutate(
    State = tolower(State),
    City = tolower(City)
  ) %>%
  left_join(
    read_csv("./data/states.csv") %>%
      mutate(
        State = tolower(State),
        Abbreviation = tolower(Abbreviation)
      ),
    by = "State"
  ) %>%
  # A State with no match in states.csv leaves Abbreviation NA, which
  # paste0() renders as the literal text "NA" in the key.
  mutate(city = paste0(City, ", ", Abbreviation)) %>%
  select(civil_reform = Year, city, police_only, fire_only) %>%
  mutate(
    city = trimws(city),
    # A missing flag means the reform was not restricted to that service.
    police_only = ifelse(is.na(police_only), 0, police_only),
    fire_only = ifelse(is.na(fire_only), 0, fire_only),
    type = case_when(
      police_only == 1 ~ "police",
      fire_only == 1 ~ "fire",
      TRUE ~ "all"
    ),
    civil_reform = as.numeric(civil_reform)
  ) %>%
  # Earliest year per city and type. Dropping the grouping here keeps the
  # final table ungrouped instead of silently grouped by city.
  group_by(city, type) %>%
  summarise(civil_reform = min(civil_reform), .groups = "drop") %>%
  # names_sort = TRUE keeps the column order all, fire, police.
  pivot_wider(
    names_from = type,
    values_from = civil_reform,
    names_sort = TRUE
  ) %>%
  # The year columns are already double, so no further conversion is needed.
  mutate(
    police = coalesce(police, all),
    fire = coalesce(fire, all)
  )

#write.csv(reforms, "./data/reforms_years.csv")

# Attach the reform years by city label and flag treated cells.
# `treat` is 1 when YEAR is strictly after the reform year that applies to
# the cell's occupation group, 0 when YEAR is at or before it, and NA when
# the occupation group is missing or the city has no applicable reform year.
data_stack <- data_stack %>%
  left_join(reforms, by = "city") %>%
  mutate(
    treat = as.numeric(
      YEAR > case_when(
        occ_recode == "police" ~ police,
        occ_recode == "fireman" ~ fire,
        occ_recode %in% c("blue_collar", "white_collar", "teacher") ~ all,
        TRUE ~ NA_real_
      )
    )
  )

# NOTE(review): write.csv() also writes row names as an unnamed first column;
# confirm downstream readers expect that.
write.csv(data_stack, "./data/census_stack_long_new_bc.csv")


